Univariate Exploration

Load the Dataset

# Load dataset

# Load necessary libraries
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(dplyr)  # %>% 
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the CSV file from Google Drive
url <- "https://drive.google.com/uc?export=download&id=12CriTyCML_9rGAySnmcitxXbij7N11T0"
my_data <- read_csv(url)
## Rows: 920 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): sex, dataset, cp, restecg, slope, thal
## dbl (8): id, age, trestbps, chol, thalch, oldpeak, ca, num
## lgl (2): fbs, exang
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View the first few rows
head(my_data)
#Removing Unnecessary Columns

my_data <- my_data %>%
  select(-id, -dataset)
print(my_data)
## # A tibble: 920 × 14
##      age sex    cp       trestbps  chol fbs   restecg thalch exang oldpeak slope
##    <dbl> <chr>  <chr>       <dbl> <dbl> <lgl> <chr>    <dbl> <lgl>   <dbl> <chr>
##  1    63 Male   typical…      145   233 TRUE  lv hyp…    150 FALSE     2.3 down…
##  2    67 Male   asympto…      160   286 FALSE lv hyp…    108 TRUE      1.5 flat 
##  3    67 Male   asympto…      120   229 FALSE lv hyp…    129 TRUE      2.6 flat 
##  4    37 Male   non-ang…      130   250 FALSE normal     187 FALSE     3.5 down…
##  5    41 Female atypica…      130   204 FALSE lv hyp…    172 FALSE     1.4 upsl…
##  6    56 Male   atypica…      120   236 FALSE normal     178 FALSE     0.8 upsl…
##  7    62 Female asympto…      140   268 FALSE lv hyp…    160 FALSE     3.6 down…
##  8    57 Female asympto…      120   354 FALSE normal     163 TRUE      0.6 upsl…
##  9    63 Male   asympto…      130   254 FALSE lv hyp…    147 FALSE     1.4 flat 
## 10    53 Male   asympto…      140   203 TRUE  lv hyp…    155 TRUE      3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
#Checking for Missing Data

my_data[!complete.cases(my_data),]
#Handling Missing Values

my_data[is.na(my_data$trestbps),]
# Calculate the median
trestbps_medv <- median(my_data$trestbps, na.rm = TRUE)
print(trestbps_medv)
## [1] 130
# Replace NA values with the median
my_data$trestbps[is.na(my_data$trestbps)] <- trestbps_medv

# Print updated data
print(my_data)
## # A tibble: 920 × 14
##      age sex    cp       trestbps  chol fbs   restecg thalch exang oldpeak slope
##    <dbl> <chr>  <chr>       <dbl> <dbl> <lgl> <chr>    <dbl> <lgl>   <dbl> <chr>
##  1    63 Male   typical…      145   233 TRUE  lv hyp…    150 FALSE     2.3 down…
##  2    67 Male   asympto…      160   286 FALSE lv hyp…    108 TRUE      1.5 flat 
##  3    67 Male   asympto…      120   229 FALSE lv hyp…    129 TRUE      2.6 flat 
##  4    37 Male   non-ang…      130   250 FALSE normal     187 FALSE     3.5 down…
##  5    41 Female atypica…      130   204 FALSE lv hyp…    172 FALSE     1.4 upsl…
##  6    56 Male   atypica…      120   236 FALSE normal     178 FALSE     0.8 upsl…
##  7    62 Female asympto…      140   268 FALSE lv hyp…    160 FALSE     3.6 down…
##  8    57 Female asympto…      120   354 FALSE normal     163 TRUE      0.6 upsl…
##  9    63 Male   asympto…      130   254 FALSE lv hyp…    147 FALSE     1.4 flat 
## 10    53 Male   asympto…      140   203 TRUE  lv hyp…    155 TRUE      3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$trestbps),]
# Handling Missing Values

my_data[is.na(my_data$chol),]
chol_meanv<- mean(my_data$chol, na.rm=TRUE)
print(chol_meanv)
## [1] 199.1303
my_data$chol[is.na(my_data$chol)] <- chol_meanv

print(my_data)
## # A tibble: 920 × 14
##      age sex    cp       trestbps  chol fbs   restecg thalch exang oldpeak slope
##    <dbl> <chr>  <chr>       <dbl> <dbl> <lgl> <chr>    <dbl> <lgl>   <dbl> <chr>
##  1    63 Male   typical…      145   233 TRUE  lv hyp…    150 FALSE     2.3 down…
##  2    67 Male   asympto…      160   286 FALSE lv hyp…    108 TRUE      1.5 flat 
##  3    67 Male   asympto…      120   229 FALSE lv hyp…    129 TRUE      2.6 flat 
##  4    37 Male   non-ang…      130   250 FALSE normal     187 FALSE     3.5 down…
##  5    41 Female atypica…      130   204 FALSE lv hyp…    172 FALSE     1.4 upsl…
##  6    56 Male   atypica…      120   236 FALSE normal     178 FALSE     0.8 upsl…
##  7    62 Female asympto…      140   268 FALSE lv hyp…    160 FALSE     3.6 down…
##  8    57 Female asympto…      120   354 FALSE normal     163 TRUE      0.6 upsl…
##  9    63 Male   asympto…      130   254 FALSE lv hyp…    147 FALSE     1.4 flat 
## 10    53 Male   asympto…      140   203 TRUE  lv hyp…    155 TRUE      3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$chol),]
# Handling Missing Values

my_data[is.na(my_data$thalch),]
thalch_meanv<- mean(my_data$thalch, na.rm=TRUE)
print(thalch_meanv)
## [1] 137.5457
my_data$thalch[is.na(my_data$thalch)] <- thalch_meanv

print(my_data)
## # A tibble: 920 × 14
##      age sex    cp       trestbps  chol fbs   restecg thalch exang oldpeak slope
##    <dbl> <chr>  <chr>       <dbl> <dbl> <lgl> <chr>    <dbl> <lgl>   <dbl> <chr>
##  1    63 Male   typical…      145   233 TRUE  lv hyp…    150 FALSE     2.3 down…
##  2    67 Male   asympto…      160   286 FALSE lv hyp…    108 TRUE      1.5 flat 
##  3    67 Male   asympto…      120   229 FALSE lv hyp…    129 TRUE      2.6 flat 
##  4    37 Male   non-ang…      130   250 FALSE normal     187 FALSE     3.5 down…
##  5    41 Female atypica…      130   204 FALSE lv hyp…    172 FALSE     1.4 upsl…
##  6    56 Male   atypica…      120   236 FALSE normal     178 FALSE     0.8 upsl…
##  7    62 Female asympto…      140   268 FALSE lv hyp…    160 FALSE     3.6 down…
##  8    57 Female asympto…      120   354 FALSE normal     163 TRUE      0.6 upsl…
##  9    63 Male   asympto…      130   254 FALSE lv hyp…    147 FALSE     1.4 flat 
## 10    53 Male   asympto…      140   203 TRUE  lv hyp…    155 TRUE      3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$thalch),]
# Handling Missing Values

my_data[is.na(my_data$ca),]
# Find the most frequent value (mode) for 'ca'
mode_ca <- as.numeric(names(sort(table(my_data$ca), decreasing = TRUE))[1])
print(mode_ca)
## [1] 0
my_data$ca[is.na(my_data$ca)] <- mode_ca


print(my_data)
## # A tibble: 920 × 14
##      age sex    cp       trestbps  chol fbs   restecg thalch exang oldpeak slope
##    <dbl> <chr>  <chr>       <dbl> <dbl> <lgl> <chr>    <dbl> <lgl>   <dbl> <chr>
##  1    63 Male   typical…      145   233 TRUE  lv hyp…    150 FALSE     2.3 down…
##  2    67 Male   asympto…      160   286 FALSE lv hyp…    108 TRUE      1.5 flat 
##  3    67 Male   asympto…      120   229 FALSE lv hyp…    129 TRUE      2.6 flat 
##  4    37 Male   non-ang…      130   250 FALSE normal     187 FALSE     3.5 down…
##  5    41 Female atypica…      130   204 FALSE lv hyp…    172 FALSE     1.4 upsl…
##  6    56 Male   atypica…      120   236 FALSE normal     178 FALSE     0.8 upsl…
##  7    62 Female asympto…      140   268 FALSE lv hyp…    160 FALSE     3.6 down…
##  8    57 Female asympto…      120   354 FALSE normal     163 TRUE      0.6 upsl…
##  9    63 Male   asympto…      130   254 FALSE lv hyp…    147 FALSE     1.4 flat 
## 10    53 Male   asympto…      140   203 TRUE  lv hyp…    155 TRUE      3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$ca),]
# Handling Missing Values

my_data[is.na(my_data$thal),]
# Find the most frequent value (mode) for 'thal'
mode_thal <- as.character(names(sort(table(my_data$thal), decreasing = TRUE))[1])
print(mode_thal)
## [1] "normal"
my_data$thal[is.na(my_data$thal)] <- mode_thal


print(my_data)
## # A tibble: 920 × 14
##      age sex    cp       trestbps  chol fbs   restecg thalch exang oldpeak slope
##    <dbl> <chr>  <chr>       <dbl> <dbl> <lgl> <chr>    <dbl> <lgl>   <dbl> <chr>
##  1    63 Male   typical…      145   233 TRUE  lv hyp…    150 FALSE     2.3 down…
##  2    67 Male   asympto…      160   286 FALSE lv hyp…    108 TRUE      1.5 flat 
##  3    67 Male   asympto…      120   229 FALSE lv hyp…    129 TRUE      2.6 flat 
##  4    37 Male   non-ang…      130   250 FALSE normal     187 FALSE     3.5 down…
##  5    41 Female atypica…      130   204 FALSE lv hyp…    172 FALSE     1.4 upsl…
##  6    56 Male   atypica…      120   236 FALSE normal     178 FALSE     0.8 upsl…
##  7    62 Female asympto…      140   268 FALSE lv hyp…    160 FALSE     3.6 down…
##  8    57 Female asympto…      120   354 FALSE normal     163 TRUE      0.6 upsl…
##  9    63 Male   asympto…      130   254 FALSE lv hyp…    147 FALSE     1.4 flat 
## 10    53 Male   asympto…      140   203 TRUE  lv hyp…    155 TRUE      3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$thal),]
# Handling Missing Values

my_data[is.na(my_data$slope),]
# Find the most frequent value (mode) for 'slope'
mode_slope <- names(sort(table(my_data$slope), decreasing = TRUE))[1]
print(mode_slope)
## [1] "flat"
my_data$slope[is.na(my_data$slope)] <- mode_slope


print(my_data)
## # A tibble: 920 × 14
##      age sex    cp       trestbps  chol fbs   restecg thalch exang oldpeak slope
##    <dbl> <chr>  <chr>       <dbl> <dbl> <lgl> <chr>    <dbl> <lgl>   <dbl> <chr>
##  1    63 Male   typical…      145   233 TRUE  lv hyp…    150 FALSE     2.3 down…
##  2    67 Male   asympto…      160   286 FALSE lv hyp…    108 TRUE      1.5 flat 
##  3    67 Male   asympto…      120   229 FALSE lv hyp…    129 TRUE      2.6 flat 
##  4    37 Male   non-ang…      130   250 FALSE normal     187 FALSE     3.5 down…
##  5    41 Female atypica…      130   204 FALSE lv hyp…    172 FALSE     1.4 upsl…
##  6    56 Male   atypica…      120   236 FALSE normal     178 FALSE     0.8 upsl…
##  7    62 Female asympto…      140   268 FALSE lv hyp…    160 FALSE     3.6 down…
##  8    57 Female asympto…      120   354 FALSE normal     163 TRUE      0.6 upsl…
##  9    63 Male   asympto…      130   254 FALSE lv hyp…    147 FALSE     1.4 flat 
## 10    53 Male   asympto…      140   203 TRUE  lv hyp…    155 TRUE      3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$slope),]
#Checking for Remaining Missing Data

my_data[!complete.cases(my_data),]
#Remove records with missing values
my_data<-my_data[complete.cases(my_data),]
print(my_data)
## # A tibble: 770 × 14
##      age sex    cp       trestbps  chol fbs   restecg thalch exang oldpeak slope
##    <dbl> <chr>  <chr>       <dbl> <dbl> <lgl> <chr>    <dbl> <lgl>   <dbl> <chr>
##  1    63 Male   typical…      145   233 TRUE  lv hyp…    150 FALSE     2.3 down…
##  2    67 Male   asympto…      160   286 FALSE lv hyp…    108 TRUE      1.5 flat 
##  3    67 Male   asympto…      120   229 FALSE lv hyp…    129 TRUE      2.6 flat 
##  4    37 Male   non-ang…      130   250 FALSE normal     187 FALSE     3.5 down…
##  5    41 Female atypica…      130   204 FALSE lv hyp…    172 FALSE     1.4 upsl…
##  6    56 Male   atypica…      120   236 FALSE normal     178 FALSE     0.8 upsl…
##  7    62 Female asympto…      140   268 FALSE lv hyp…    160 FALSE     3.6 down…
##  8    57 Female asympto…      120   354 FALSE normal     163 TRUE      0.6 upsl…
##  9    63 Male   asympto…      130   254 FALSE lv hyp…    147 FALSE     1.4 flat 
## 10    53 Male   asympto…      140   203 TRUE  lv hyp…    155 TRUE      3.1 down…
## # ℹ 760 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[!complete.cases(my_data),]

Summary Statistics

# Summary of numeric variables
summary(my_data)
##       age            sex                 cp               trestbps    
##  Min.   :28.00   Length:770         Length:770         Min.   :  0.0  
##  1st Qu.:46.00   Class :character   Class :character   1st Qu.:120.0  
##  Median :54.00   Mode  :character   Mode  :character   Median :130.0  
##  Mean   :53.04                                         Mean   :132.8  
##  3rd Qu.:59.75                                         3rd Qu.:140.0  
##  Max.   :77.00                                         Max.   :200.0  
##       chol          fbs            restecg              thalch     
##  Min.   :  0.0   Mode :logical   Length:770         Min.   : 60.0  
##  1st Qu.:198.0   FALSE:656       Class :character   1st Qu.:120.0  
##  Median :228.5   TRUE :114       Mode  :character   Median :140.0  
##  Mean   :218.8                                      Mean   :138.6  
##  3rd Qu.:269.0                                      3rd Qu.:159.0  
##  Max.   :603.0                                      Max.   :202.0  
##    exang            oldpeak           slope                 ca       
##  Mode :logical   Min.   :-1.0000   Length:770         Min.   :0.000  
##  FALSE:466       1st Qu.: 0.0000   Class :character   1st Qu.:0.000  
##  TRUE :304       Median : 0.5000   Mode  :character   Median :0.000  
##                  Mean   : 0.8874                      Mean   :0.261  
##                  3rd Qu.: 1.5000                      3rd Qu.:0.000  
##                  Max.   : 6.2000                      Max.   :3.000  
##      thal                num        
##  Length:770         Min.   :0.0000  
##  Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :1.0000  
##                     Mean   :0.9195  
##                     3rd Qu.:1.0000  
##                     Max.   :4.0000

Univariate Plots

hist(my_data$age, 
     main="Histogram of Age", 
     col="lightblue", 
     xlab="Age", 
     border="black")

hist(my_data$trestbps, 
     main="Histogram of trestbps ", 
     col="green", 
     xlab="trestbps", 
     border="black")

# Univariate Plots

hist(my_data$chol, 
     main="Histogram of chol", 
     col="yellow", 
     xlab="chol", 
     border="black")

Univariate Plots

hist(my_data$thalch, 
     main="Histogram of thalch ", 
     col="blue", 
     xlab="thalch", 
     border="black")

hist(my_data$oldpeak, 
     main="Histogram of oldpeak", 
     col="lightblue", 
     xlab="oldpeak", 
     border="black")

hist(my_data$ca, 
     main="Histogram of Chest pain", 
     col="lightblue", 
     xlab="Ca", 
     border="black")

hist(my_data$num, 
     main="Histogram of num", 
     col="lightblue", 
     xlab="num", 
     border="black")

# Convert 'restecg' to numeric
my_data$restecg <- as.numeric(factor(my_data$restecg, levels = c("normal", "lv hypertrophy", "st-t abnormality")))

# Convert 'fbs' (fasting blood sugar) to numeric
my_data$fbs <- ifelse(my_data$fbs == TRUE, 1, 0)

# Convert 'slope' to numeric
my_data$slope <- as.numeric(factor(my_data$slope, levels = c("upsloping", "flat", "downsloping")))



my_data$num <- as.factor(my_data$num)

# Update the levels of 'num' to represent meaningful categories
levels(my_data$num) <- c("No Heart Disease", "Heart Disease -Type1", "Heart Disease -Type2", "Heart Disease -Type3","Heart Disease -Type4")


str(my_data)
## tibble [770 × 14] (S3: tbl_df/tbl/data.frame)
##  $ age     : num [1:770] 63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : chr [1:770] "Male" "Male" "Male" "Male" ...
##  $ cp      : chr [1:770] "typical angina" "asymptomatic" "asymptomatic" "non-anginal" ...
##  $ trestbps: num [1:770] 145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num [1:770] 233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : num [1:770] 1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg : num [1:770] 2 2 2 1 2 1 2 1 2 2 ...
##  $ thalch  : num [1:770] 150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : logi [1:770] FALSE TRUE TRUE FALSE FALSE FALSE ...
##  $ oldpeak : num [1:770] 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : num [1:770] 3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : num [1:770] 0 3 2 0 0 0 2 0 1 0 ...
##  $ thal    : chr [1:770] "fixed defect" "normal" "reversable defect" "normal" ...
##  $ num     : Factor w/ 5 levels "No Heart Disease",..: 1 3 2 1 1 1 4 1 3 2 ...
# Print the first few rows to verify the new labels
head(my_data)
hist(my_data$restecg, 
     main="Histogram of restecg", 
     col="lightblue", 
     xlab="restecg", 
     border="black")

hist(my_data$fbs, 
     main="Histogram of fbs", 
     col="lightblue", 
     xlab="fbs", 
     border="black")

hist(my_data$slope, 
     main="Histogram of slope", 
     col="lightblue", 
     xlab="slope", 
     border="black")

# Density Plot (Smooth Distribution)
plot(density(my_data$age), 
     main="Density Plot of Age", 
     col="blue", 
     lwd=2)

# Density Plot (Smooth Distribution)
plot(density(my_data$trestbps), 
     main="Density Plot of trestbps", 
     col="blue", 
     lwd=2)

# Density Plot (Smooth Distribution)
plot(density(my_data$chol), 
     main="Density Plot of chol ", 
     col="blue", 
     lwd=2)

# Density Plot (Smooth Distribution)
plot(density(my_data$thalch), 
     main="Density Plot of thalch", 
     col="blue", 
     lwd=2)

# Density Plot (Smooth Distribution)
plot(density(my_data$oldpeak), 
     main="Density Plot of oldpeak", 
     col="blue", 
     lwd=2)

# Density Plot (Smooth Distribution)
plot(density(my_data$slope), 
     main="Density Plot of slope", 
     col="blue", 
     lwd=2)

# Density Plot (Smooth Distribution)
plot(density(my_data$restecg), 
     main="Density Plot of restecg", 
     col="blue", 
     lwd=2)

# Boxplot 
boxplot(my_data$age, 
        main="Box Plot of Age", 
        col="orange")

# Boxplot 
boxplot(my_data$trestbps, 
        main="Box Plot of trestbps ", 
        col="orange")

# Boxplot 
boxplot(my_data$chol, 
        main="Box Plot of chol", 
        col="orange")

# Boxplot 
boxplot(my_data$restecg, 
        main="Box Plot of restecg ", 
        col="orange")

# Boxplot 
boxplot(my_data$thalch, 
        main="Box Plot of thalch", 
        col="orange")

# Boxplot 
boxplot(my_data$oldpeak, 
        main="Box Plot of oldpeak", 
        col="orange")

Exploring Categorical Data

# Count of each num of affected people 
table(my_data$num)
## 
##     No Heart Disease Heart Disease -Type1 Heart Disease -Type2 
##                  375                  210                   81 
## Heart Disease -Type3 Heart Disease -Type4 
##                   80                   24
# Bar plot for count
barplot(table(my_data$num), 
        main="Count of num", 
        col=c("red", "green", "blue"))

# Bar plot for  count
barplot(table(my_data$sex), 
        main="Count of sex", 
        col=c("red", "green", "blue"))

# Bar plot for  count
barplot(table(my_data$cp), 
        main="Count of cp", 
        col=c("red", "green", "blue"))

# Bar plot for  count
barplot(table(my_data$trestbps), 
        main="Count of trestbps", 
        col=c("red", "green", "blue"))

# Bar plot for count
barplot(table(my_data$slope), 
        main="Count of slope", 
        col=c("red", "green", "blue"))

# Bar plot for Species count
barplot(table(my_data$thal), 
        main="Count of thal", 
        col=c("red", "green", "blue"))

Multivariate Exploration

Scatter Plot (Numeric vs. Numeric)

# Scatter plot: age vs trestbps
plot(my_data$age, my_data$trestbps, 
     main="Scatter Plot of Age vs trestbps",
     xlab="age", ylab="trestbps",
     col=my_data$num, pch=20)
legend("bottomright", legend=levels(my_data$num), col=1:4, pch=19)

# Multivariate Exploration # Scatter Plot (Numeric vs. Numeric)

# Scatter plot: thalch vs chol
plot(my_data$thalch, my_data$chol, 
     main="Scatter Plot of thalch vs chol",
     xlab="thalch", ylab="chol",
     col=my_data$num, pch=20)
legend("topleft", legend=levels(my_data$num), col=1:4, pch=20)

# Scatter plot: thalch vs trestbps
plot(my_data$thalch, my_data$trestbps, 
     main="Scatter Plot of thalch vs trestbps",
     xlab="thalch", ylab="trestbps",
     col=my_data$num, pch=20)
legend("bottomleft", legend=levels(my_data$num), col=1:4, pch=20)

Correlation (Numeric vs. Numeric)

# Correlation between age and trestbs
cor(my_data$age, my_data$trestbps)
## [1] 0.2409653
# Correlation between age and trestbs
cor(my_data$age, my_data$chol)
## [1] -0.06950973
# Correlation between chol and trestbs
cor(my_data$chol, my_data$trestbps)
## [1] 0.0599012
# Correlation between thalch and trestbs
cor(my_data$thalch, my_data$trestbps)
## [1] -0.109774
# Correlation between age and thalch
cor(my_data$age, my_data$thalch)
## [1] -0.3739944
# Correlation between age and slope
cor(my_data$age, my_data$slope)
## [1] 0.06146084

Correlation matrix

# Select only numeric columns
numeric_cols <- sapply(my_data, is.numeric)
correlation_matrix <- cor(my_data[, numeric_cols], use = "complete.obs")

# Print the correlation matrix
print(correlation_matrix)
##                  age    trestbps        chol         fbs     restecg
## age       1.00000000  0.24096532 -0.06950973  0.22647439  0.20749775
## trestbps  0.24096532  1.00000000  0.05990120  0.15692273  0.10034657
## chol     -0.06950973  0.05990120  1.00000000  0.02907186 -0.06549605
## fbs       0.22647439  0.15692273  0.02907186  1.00000000  0.12153080
## restecg   0.20749775  0.10034657 -0.06549605  0.12153080  1.00000000
## thalch   -0.37399438 -0.10977400  0.19940868 -0.05949849 -0.09568953
## oldpeak   0.26091537  0.17080414  0.05526483  0.05546392  0.10888213
## slope     0.06146084  0.08097304 -0.05522410  0.06742084  0.05004836
## ca        0.24956877  0.02804014  0.15579945  0.07785460  0.03359935
##               thalch     oldpeak       slope          ca
## age      -0.37399438  0.26091537  0.06146084  0.24956877
## trestbps -0.10977400  0.17080414  0.08097304  0.02804014
## chol      0.19940868  0.05526483 -0.05522410  0.15579945
## fbs      -0.05949849  0.05546392  0.06742084  0.07785460
## restecg  -0.09568953  0.10888213  0.05004836  0.03359935
## thalch    1.00000000 -0.18268766 -0.31214858  0.03739947
## oldpeak  -0.18268766  1.00000000  0.31268535  0.23154038
## slope    -0.31214858  0.31268535  1.00000000 -0.10682672
## ca        0.03739947  0.23154038 -0.10682672  1.00000000
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
# Calculate correlation matrix
numeric_cols <- sapply(my_data, is.numeric)
correlation_matrix <- cor(my_data[, numeric_cols], use = "complete.obs")

# Create heatmap with correlation values
corrplot(correlation_matrix, 
         method = "color",
         type = "upper",
         order = "hclust",
         tl.col = "black",
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.7,
         diag = FALSE)

library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(
  x = colnames(correlation_matrix),
  y = colnames(correlation_matrix),
  z = correlation_matrix,
  type = "heatmap",
  colorscale = "Viridis",
  hoverinfo = "x+y+z"
) %>% 
  layout(
    title = "Interactive Correlation Matrix",
    xaxis = list(tickangle = 45),
    margin = list(l = 100, r = 100, b = 150)
  )

Box Plot (Numeric vs. Categorical)

# Boxplot: Cholesterol by Heart Disease Status
boxplot(chol ~ num, data = my_data,
        main = "Boxplot of Cholesterol by Heart Disease Status",
        xlab = "Heart Disease",
        ylab = "Cholesterol",
        col = c("skyblue", "tomato"))

# Box Plot (Numeric vs. Categorical)

# Boxplot: Thalch by Sex
boxplot(thalch ~ sex, data = my_data,
        main = "Boxplot of Max Heart Rate by Sex",
        xlab = "Sex",
        ylab = "Max Heart Rate (thalch)",
        col = c("lightpink", "lightblue"))

# Box Plot (Numeric vs. Categorical)

# Boxplot: Oldpeak by Chest Pain Type
boxplot(oldpeak ~ cp, data = my_data,
        main = "Boxplot of ST Depression (Oldpeak) by Chest Pain Type",
        xlab = "Chest Pain Type",
        ylab = "Oldpeak",
        col = rainbow(length(unique(my_data$cp))))

# Box Plot (Numeric vs. Categorical)

# Boxplot: Trestbps by Slope
boxplot(trestbps ~ slope, data = my_data,
        main = "Boxplot of Resting BP by Slope of Peak Exercise ST",
        xlab = "Slope Type",
        ylab = "Resting Blood Pressure",
        col = c("orange", "cyan", "lightgreen"))

library(ggplot2)
ggplot(my_data, aes(x = as.factor(num), y = chol, fill = as.factor(num))) +
  geom_boxplot() +
  ggtitle("Cholesterol Levels by Heart Disease Class") +
  xlab("Heart Disease Class (num)") +
  ylab("Serum Cholesterol (mg/dl)") +
  theme_minimal()

install.packages(“ggplot2”) # Pair Plot (Multivariate Visualization)

pairs(my_data[, c("age", "trestbps", "chol", "thalch", "oldpeak")],
      col = my_data$num,
      pch = 19,
      main = "Pair Plot of Selected Features Colored by Heart Disease")

Using ggplot2 for Enhanced Visualization

library(ggplot2)

my_data$num <- as.factor(my_data$num)


ggplot(my_data, aes(x = age, y = thalch, color = num)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Age vs Max Heart Rate (thalch) by Heart Disease Class") +
  xlab("Age") +
  ylab("Max Heart Rate Achieved (thalch)")
## `geom_smooth()` using formula = 'y ~ x'

Using ggplot2 for Enhanced Visualization

ggplot(my_data, aes(x = trestbps, y = chol, color = num)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Resting Blood Pressure vs Cholesterol by Heart Disease Class") +
  xlab("Resting Blood Pressure (trestbps)") +
  ylab("Serum Cholesterol (chol)")
## `geom_smooth()` using formula = 'y ~ x'

Using ggplot2 for Enhanced Visualization

# Scatter plot with regression line for thalch vs chol
ggplot(my_data, aes(x = thalch, y = chol, color = num)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Maximum Heart Rate Achieved vs Cholesterol by Heart Disease Class") +
  xlab("Maximum Heart Rate Achieved (thalch)") +
  ylab("Serum Cholesterol (chol)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Using ggplot2 for Enhanced Visualization

# Load necessary libraries
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
# Select numeric columns for scaling
numeric_cols <- c("age", "trestbps", "chol", "thalch", "oldpeak")

# Apply standardization (z-score normalization)
preProcValues <- preProcess(my_data[, numeric_cols], method = c("center", "scale"))
my_data_scaled <- predict(preProcValues, my_data[, numeric_cols])

# Replace original columns with scaled versions
my_data[, numeric_cols] <- my_data_scaled

# Verify the transformation
summary(my_data[, numeric_cols])
##       age             trestbps            chol             thalch        
##  Min.   :-2.6527   Min.   :-7.1575   Min.   :-2.3638   Min.   :-3.04530  
##  1st Qu.:-0.7459   1st Qu.:-0.6883   1st Qu.:-0.2244   1st Qu.:-0.72072  
##  Median : 0.1015   Median :-0.1492   Median : 0.1052   Median : 0.05414  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.7106   3rd Qu.: 0.3899   3rd Qu.: 0.5428   3rd Qu.: 0.79026  
##  Max.   : 2.5380   Max.   : 3.6245   Max.   : 4.1519   Max.   : 2.45621  
##     oldpeak       
##  Min.   :-1.7406  
##  1st Qu.:-0.8184  
##  Median :-0.3573  
##  Mean   : 0.0000  
##  3rd Qu.: 0.5649  
##  Max.   : 4.8994
# Binary encoding for sex (Male=1, Female=0)
my_data$sex <- ifelse(my_data$sex == "Male", 1, 0)

# Ensure fbs is integer (fasting blood sugar > 120 mg/dl)
my_data$fbs <- as.integer(my_data$fbs)

# Convert target variable to factor
my_data$num <- as.factor(my_data$num)

# Identify numeric columns for scaling (excluding already encoded categoricals and target)
numeric_cols <- c("age", "trestbps", "chol", "thalch", "oldpeak", "ca")

# Apply standardization (z-score normalization)
library(caret)
preProcValues <- preProcess(my_data[, numeric_cols], method = c("center", "scale"))

# View the structure of your data
str(my_data)
## tibble [770 × 14] (S3: tbl_df/tbl/data.frame)
##  $ age     : num [1:770] 1.05 1.48 1.48 -1.7 -1.28 ...
##  $ sex     : num [1:770] 1 1 1 1 0 1 0 0 1 1 ...
##  $ cp      : chr [1:770] "typical angina" "asymptomatic" "asymptomatic" "non-anginal" ...
##  $ trestbps: num [1:770] 0.659 1.468 -0.688 -0.149 -0.149 ...
##  $ chol    : num [1:770] 0.154 0.727 0.111 0.338 -0.16 ...
##  $ fbs     : int [1:770] 1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg : num [1:770] 2 2 2 1 2 1 2 1 2 2 ...
##  $ thalch  : num [1:770] 0.442 -1.186 -0.372 1.875 1.294 ...
##  $ exang   : logi [1:770] FALSE TRUE TRUE FALSE FALSE FALSE ...
##  $ oldpeak : num [1:770] 1.303 0.565 1.579 2.409 0.473 ...
##  $ slope   : num [1:770] 3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : num [1:770] 0 3 2 0 0 0 2 0 1 0 ...
##  $ thal    : chr [1:770] "fixed defect" "normal" "reversable defect" "normal" ...
##  $ num     : Factor w/ 5 levels "No Heart Disease",..: 1 3 2 1 1 1 4 1 3 2 ...
# Check the first few rows
head(my_data)
# Function to detect outliers using IQR method
detect_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  x < lower_bound | x > upper_bound
}

# Check for outliers in numeric columns
outliers_list <- lapply(my_data[, numeric_cols], detect_outliers)
outliers_count <- sapply(outliers_list, sum)
print(outliers_count)
##      age trestbps     chol   thalch  oldpeak       ca 
##        0       25       99        1       15      123
# Boxplots to visualize outliers
par(mfrow = c(2, 3))
for (col in numeric_cols) {
  boxplot(my_data[[col]], main = paste("Boxplot of", col))
}

par(mfrow = c(1, 1))
# 1. Robust Scaling
library(caret)
robust_scaler <- preProcess(my_data[, numeric_cols],
                          method = c("center", "scale", "YeoJohnson"))
my_data[, numeric_cols] <- predict(robust_scaler, my_data[, numeric_cols])

# 2. Check for zero-IQR columns
zero_iqr_cols <- sapply(my_data[, numeric_cols], function(x) IQR(x, na.rm = TRUE) == 0)
if(any(zero_iqr_cols)) {
  message("Columns with IQR=0: ", paste(names(zero_iqr_cols)[zero_iqr_cols], collapse = ", "))

  valid_cols <- names(zero_iqr_cols)[!zero_iqr_cols]
}
## Columns with IQR=0: ca
# 3. Multivariate Outlier Detection (with fallback)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
if(length(valid_cols) > 1) {  # Need at least 2 columns for covariance
  tryCatch({
    mcd <- cov.mcd(my_data[, valid_cols])
    mahalanobis_dist <- mahalanobis(my_data[, valid_cols], 
                                  mcd$center, 
                                  mcd$cov)
    cutoff <- qchisq(0.99, df = length(valid_cols))
    outliers <- mahalanobis_dist > cutoff
    
    # 4. Handle outliers (capping approach)
    my_data[outliers, valid_cols] <- 
      apply(my_data[outliers, valid_cols], 2, 
            function(x) pmin(pmax(x, 
                                 quantile(x, 0.01, na.rm = TRUE)),
                            quantile(x, 0.99, na.rm = TRUE)))
  }, error = function(e) {
    message("MCD failed, using regular Mahalanobis: ", e$message)
    # Fallback to regular Mahalanobis
    mahalanobis_dist <- mahalanobis(my_data[, valid_cols], 
                                  colMeans(my_data[, valid_cols]), 
                                  cov(my_data[, valid_cols]))
    cutoff <- qchisq(0.99, df = length(valid_cols))
    outliers <- mahalanobis_dist > cutoff
  })
} else {
  message("Insufficient valid columns for multivariate outlier detection")
  outliers <- rep(FALSE, nrow(my_data))
}

# 5. Final Verification
summary(my_data[, numeric_cols])
##       age                trestbps              chol          
##  Min.   :-2.4825139   Min.   :-2.335582   Min.   :-2.045079  
##  1st Qu.:-0.7616226   1st Qu.:-0.673934   1st Qu.:-0.333804  
##  Median : 0.0639728   Median :-0.107625   Median : 0.001456  
##  Mean   : 0.0005325   Mean   : 0.008164   Mean   :-0.001047  
##  3rd Qu.: 0.6966808   3rd Qu.: 0.426799   3rd Qu.: 0.498388  
##  Max.   : 2.7240781   Max.   : 3.268579   Max.   : 5.241707  
##      thalch              oldpeak                ca        
##  Min.   :-2.6499355   Min.   :-1.997013   Min.   :-0.390  
##  1st Qu.:-0.7452664   1st Qu.:-0.956260   1st Qu.:-0.390  
##  Median : 0.0024077   Median :-0.111733   Median :-0.390  
##  Mean   :-0.0007927   Mean   : 0.001874   Mean   : 0.000  
##  3rd Qu.: 0.7790438   3rd Qu.: 0.880230   3rd Qu.:-0.390  
##  Max.   : 2.6990588   Max.   : 2.361611   Max.   : 4.092
boxplot(my_data[, numeric_cols], main = "Post-Processing Distributions")